import pandas as pd
import plotly.express as px
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
# Initialise Plotly's notebook mode before any figure is displayed.
init_notebook_mode(connected=True)
# Load the diamond dataset from "diamonds.csv" in the working directory.
df = pd.read_csv("diamonds.csv")
# Preview the first rows to check that the file parsed as expected.
df.head()
| Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# List the column labels of the freshly loaded frame.
df.columns
Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
'price', 'x', 'y', 'z'],
dtype='object')
# 'Unnamed: 0' only holds a running row number (1, 2, 3, ...), so it says
# nothing about the diamonds themselves; remove it in place.
df.drop(columns=['Unnamed: 0'], inplace=True)
# Confirm that the column is gone.
df.head()
| carat | cut | color | clarity | depth | table | price | x | y | z | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Count the missing values in each column.
df.isnull().sum()
carat 0 cut 0 color 0 clarity 0 depth 0 table 0 price 0 x 0 y 0 z 0 dtype: int64
# Print the dtype and non-null count of every column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 53940 entries, 0 to 53939 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 carat 53940 non-null float64 1 cut 53940 non-null object 2 color 53940 non-null object 3 clarity 53940 non-null object 4 depth 53940 non-null float64 5 table 53940 non-null float64 6 price 53940 non-null int64 7 x 53940 non-null float64 8 y 53940 non-null float64 9 z 53940 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 4.1+ MB
# Summary statistics of the numeric columns, transposed so that each column
# of the data becomes one row of the table.
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| carat | 53940.0 | 0.797940 | 0.474011 | 0.2 | 0.40 | 0.70 | 1.04 | 5.01 |
| depth | 53940.0 | 61.749405 | 1.432621 | 43.0 | 61.00 | 61.80 | 62.50 | 79.00 |
| table | 53940.0 | 57.457184 | 2.234491 | 43.0 | 56.00 | 57.00 | 59.00 | 95.00 |
| price | 53940.0 | 3932.799722 | 3989.439738 | 326.0 | 950.00 | 2401.00 | 5324.25 | 18823.00 |
| x | 53940.0 | 5.731157 | 1.121761 | 0.0 | 4.71 | 5.70 | 6.54 | 10.74 |
| y | 53940.0 | 5.734526 | 1.142135 | 0.0 | 4.72 | 5.71 | 6.54 | 58.90 |
| z | 53940.0 | 3.538734 | 0.705699 | 0.0 | 2.91 | 3.53 | 4.04 | 31.80 |
import pandas as pd
import plotly.express as px
# Reload the raw CSV into `df`.
# NOTE(review): this replaces the cleaned frame, so 'Unnamed: 0' is back.
df = pd.read_csv("diamonds.csv")

# Animated line chart: cut on x, clarity on y, one animation frame per
# distinct `depth` value.
# NOTE(review): both axes are categorical, yet numeric ranges (0-5 and
# 0-20000) are forced on them -- confirm that these limits are intended.
fig = px.line(
    df,
    x="cut",
    y="clarity",
    animation_frame="depth",
    range_x=[0, 5],
    range_y=[0, 20000],
)

# Render the interactive figure.
fig.show()
# Box plot of the price distribution within each cut grade.
fig = px.box(
    df,
    x="cut",
    y="price",
)
fig.show()
# Inspect the column labels currently held by `df`.
df.columns
Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
'price', 'x', 'y', 'z'],
dtype='object')
import pandas as pd
import plotly.express as px
# Reload the raw CSV into `df`.
df = pd.read_csv("diamonds.csv")

# Animated scatter: carat on x, cut on y, one animation frame per distinct
# `table` value (the data has no time column).
# NOTE(review): y is the categorical `cut`, but range_y=[0, 20000] looks like
# a price range -- confirm whether y was meant to be "price".
fig = px.scatter(
    df,
    x="carat",
    y="cut",
    animation_frame="table",
    range_x=[0, 5],
    range_y=[0, 20000],
)

# Render the interactive figure.
fig.show()
import plotly.express as px
# Distribution of diamond prices, drawn on a compact 600x400 canvas.
fig = px.histogram(df, x="price", title="Histogram of diamond prices",
                   width=600, height=400)
fig.show()
import plotly.express as px
# Correlation heatmap of the numeric columns.
# `numeric_only=True` is passed explicitly: `cut`, `color` and `clarity` hold
# strings, and relying on the deprecated default raises a FutureWarning now
# and stops working once pandas flips that default to False.
fig = px.imshow(
    df.corr(numeric_only=True),
    color_continuous_scale="Inferno_r",
)
# Show plot
fig.show()
C:\Users\alexa\AppData\Local\Temp\ipykernel_14444\2978107106.py:5: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
-- Ordinary Least Squares (OLS) trendline function. Requires statsmodels to be installed. This trendline function causes fit results to be stored within the figure, accessible via the plotly.express.get_trendline_results function.
# Carat against price with an ordinary-least-squares trend line
# (the "ols" trendline needs statsmodels to be installed).
fig = px.scatter(
    df,
    x='carat',
    y='price',
    trendline="ols",
    title='Carat Vs Price',
    color_discrete_sequence=['red'],
    template='plotly_dark',
)
fig.show()
# Mean price per cut grade, drawn as one coloured bar per grade.
cut_price = df.groupby("cut")["price"].mean().reset_index()
px.bar(
    cut_price,
    x="cut",
    y='price',
    color='cut',
    title='Cut Vs Prices',
)
# Mean price per color grade, drawn as one coloured bar per grade.
color_price = df.groupby("color")["price"].mean().reset_index()
px.bar(
    color_price,
    x='color',
    y='price',
    color='color',
    template='ggplot2',
    title='Color Vs Prices',
)
# Mean price per clarity grade; peek at the first rows of the result.
clarity_price = df.groupby("clarity")["price"].mean().reset_index()
clarity_price.head()
| clarity | price | |
|---|---|---|
| 0 | I1 | 3924.168691 |
| 1 | IF | 2864.839106 |
| 2 | SI1 | 3996.001148 |
| 3 | SI2 | 5063.028606 |
| 4 | VS1 | 3839.455391 |
# Bar chart of the mean price for every clarity grade, all bars in green.
px.bar(
    clarity_price,
    x='clarity',
    y='price',
    color_discrete_sequence=['green'],
    title='Clarity vs. Price',
)
# Depth against price with an OLS trend line on the dark template.
px.scatter(
    df,
    x='depth',
    y='price',
    color_discrete_sequence=['orange'],
    trendline="ols",
    template='plotly_dark',
    title='Depth Vs Price',
)
# Carat against price, coloured by color grade, with an OLS trend line
# requested alongside the color grouping.
fig = px.scatter(
    df,
    x="carat",
    y="price",
    trendline="ols",
    color="color",
    title="Carat Vs Color Vs Price",
)
fig.show()
# Fold the three dimensions into a single `volume` feature (x * y * z).
df["volume"] = df["x"].mul(df["y"]).mul(df["z"])
# The raw dimensions are not needed once they are combined into `volume`.
df = df.drop(columns=["x", "y", "z"])
# A zero volume means at least one dimension was recorded as 0; drop those rows.
df = df.drop(index=df.index[df["volume"] == 0])
df.head()
| Unnamed: 0 | carat | cut | color | clarity | depth | table | price | volume | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 38.202030 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 34.505856 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 38.076885 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 46.724580 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 51.917250 |
# Carat against price: marker size follows `depth`, marker colour follows
# `cut`, and an OLS trend line is requested alongside the cut grouping.
figure = px.scatter(
    df,
    x="carat",
    y="price",
    size="depth",
    color="cut",
    trendline="ols",
)
figure.show()
# Price distribution per cut grade, split further by color grade.
fig = px.box(df, x="cut", y="price", color="color")
fig.show()
# Price distribution per cut grade, split further by clarity grade.
fig = px.box(df, x="cut", y="price", color="clarity")
fig.show()
# Correlation of every numeric column with `price`, strongest first.
# `numeric_only=True` is passed explicitly because `df` still contains string
# columns (cut, color, clarity); relying on the deprecated default raises a
# FutureWarning and breaks once pandas changes that default to False.
correlation = df.corr(numeric_only=True)
print(correlation["price"].sort_values(ascending=False))
price 1.000000 carat 0.921592 volume 0.904255 table 0.127245 depth -0.010729 Unnamed: 0 -0.307092 Name: price, dtype: float64
C:\Users\alexa\AppData\Local\Temp\ipykernel_14444\3962303493.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
import pandas as pd
import plotly.express as px
# Reload the raw CSV into `df`.
# NOTE(review): this discards the derived `volume` column and brings the raw
# columns (including 'Unnamed: 0') back.
df = pd.read_csv("diamonds.csv")

# Animated scatter of carat against price with one animation frame per cut
# grade; both axes are pinned so the frames stay comparable.
fig = px.scatter(
    df,
    x="carat",
    y="price",
    animation_frame="cut",
    range_x=[0, 5],
    range_y=[0, 20000],
)

# Render the interactive figure.
fig.show()
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.stats import ttest_ind
# Carat vs. price, animated with one frame per cut grade. The axis limits are
# taken from the data and padded by 1 carat and 1000 price units.
fig = px.scatter(
    df,
    x='carat',
    y='price',
    animation_frame='cut',
    range_x=[0, df['carat'].max() + 1],
    range_y=[0, df['price'].max() + 1000],
    labels={'carat': 'Carat', 'price': 'Price'},
    title="Carat vs Price Relationship by Cut",
)

# Fixed 800x500 canvas, no legend, explicit axis titles.
fig.update_layout(
    showlegend=False,
    width=800,
    height=500,
    xaxis={"title": {"text": 'Carat'}},
    yaxis={"title": {"text": 'Price'}},
)

# Render the animated scatter plot.
fig.show()
# Animated scatter of carat against price with one animation frame per
# clarity grade; both axes are pinned to fixed ranges.
fig = px.scatter(
    df,
    x="carat",
    y="price",
    animation_frame="clarity",
    range_x=[0, 5],
    range_y=[0, 20000],
)
fig.show()
# Statistical tests: two-sample t-tests on price for every pair of cut grades.
from itertools import combinations

significance_level = 0.05
cut_categories = df["cut"].unique()

# combinations() yields each unordered pair exactly once, in the same order a
# nested index loop over `cut_categories` would produce.
# NOTE(review): ttest_ind is called with its defaults and the ten p-values are
# compared to the threshold without a multiple-comparison correction -- confirm
# that this is acceptable for the analysis.
for first_cut, second_cut in combinations(cut_categories, 2):
    category_1_price = df.loc[df["cut"] == first_cut, "price"]
    category_2_price = df.loc[df["cut"] == second_cut, "price"]
    t_statistic, p_value = ttest_ind(category_1_price, category_2_price)

    print(f"T-test between {first_cut} and {second_cut}:")
    print(f" T-Statistic: {t_statistic}")
    print(f" P-value: {p_value}")
    if p_value < significance_level:
        print(" Null hypothesis rejected. There is a significant difference in average prices.")
    else:
        # "accepted" in the message means the test failed to reject the null.
        print(" Null hypothesis accepted. There is no significant difference in average prices.")
T-test between Ideal and Premium:
T-Statistic: -25.650910588183198
P-value: 8.585274269295433e-144
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Ideal and Good:
T-Statistic: -7.871461326266651
P-value: 3.638743214736485e-15
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Ideal and Very Good:
T-Statistic: -11.965879837440761
P-value: 6.255962296590797e-33
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Ideal and Fair:
T-Statistic: -9.19948373061459
P-value: 3.892181603768655e-20
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Premium and Good:
T-Statistic: 9.4221127821383
P-value: 4.922340877355623e-21
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Premium and Very Good:
T-Statistic: 11.619045834785702
P-value: 3.9409266225202815e-31
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Premium and Fair:
T-Statistic: 2.0034976676632996
P-value: 0.04514138588179719
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Good and Very Good:
T-Statistic: -0.8085850944660599
P-value: 0.41876516590101354
Null hypothesis accepted. There is no significant difference in average prices.
T-test between Good and Fair:
T-Statistic: -4.098387593811541
P-value: 4.210907188664372e-05
Null hypothesis rejected. There is a significant difference in average prices.
T-test between Very Good and Fair:
T-Statistic: -3.6495164503086652
P-value: 0.00026371160550971715
Null hypothesis rejected. There is a significant difference in average prices.
import plotly.express as px
import seaborn as sns
# Fetch the 'diamonds' example dataset through seaborn.
diamonds = sns.load_dataset('diamonds')

# Mean price for every (cut, clarity) combination: cuts as rows, clarities
# as columns.
pivot_table = diamonds.pivot_table(
    index='cut',
    columns='clarity',
    values='price',
    aggfunc='mean',
)

# Heatmap of that table, labelled with the pivot's own rows and columns.
fig = px.imshow(
    pivot_table,
    labels={"x": "Clarity", "y": "Cut", "color": "Average Price"},
    x=pivot_table.columns,
    y=pivot_table.index,
    color_continuous_scale='viridis',
    title="Diamond Average Price Heatmap",
)
fig.show()
# Inspect the column labels currently held by `df`.
df.columns
Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
'price', 'x', 'y', 'z'],
dtype='object')
# Carat against price with one animation frame per cut grade; the axes start
# at 0 and end 1 carat / 1000 price units beyond the largest observed value.
fig = px.scatter(
    df,
    x='carat',
    y='price',
    animation_frame='cut',
    range_x=[0, df['carat'].max() + 1],
    range_y=[0, df['price'].max() + 1000],
    labels={'carat': 'Carat', 'price': 'Price'},
    title="Carat vs Price Relationship by Cut",
)

# Layout: hide the legend, fix the canvas at 800x500, title both axes.
fig.update_layout(
    showlegend=False,
    width=800,
    height=500,
    xaxis={"title": {"text": 'Carat'}},
    yaxis={"title": {"text": 'Price'}},
)

# Render the animated scatter plot.
fig.show()
# Mean price for every (cut, color) pair, flattened back into plain columns.
data = (
    df.groupby(['cut', 'color'])['price']
    .mean()
    .reset_index()
)
import pandas as pd
import plotly.express as px
# Reload the raw CSV into `df`.
df = pd.read_csv('diamonds.csv')
# Mean price for every (cut, color) pair.
# `price` is selected BEFORE aggregating: calling mean() on the whole group
# also tries to average the string column `clarity`, which raises the
# numeric_only FutureWarning and fails once that default becomes False.
grouped_df = df.groupby(['cut', 'color'])['price'].mean().reset_index()
C:\Users\alexa\AppData\Local\Temp\ipykernel_14444\1141413756.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
# Mean price per (color, cut) cell: colors as rows, cuts as columns.
price_matrix = grouped_df.pivot(index='color', columns='cut', values='price')

# Base heatmap. The axis labels are read from the pivot itself so they always
# match the rows and columns of the matrix instead of a hand-typed list.
fig = px.imshow(
    price_matrix,
    labels=dict(x="Cut", y="Color", color="Price"),
    x=list(price_matrix.columns),
    y=list(price_matrix.index),
    title="Diamond Price Heatmap",
)

# Play / Pause buttons driving the animation frames attached further down.
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            buttons=[
                dict(
                    label="Play",
                    method="animate",
                    args=[
                        None,
                        {
                            "frame": {"duration": 1000, "redraw": True},
                            "fromcurrent": True,
                            "transition": {"duration": 500},
                        },
                    ],
                ),
                dict(
                    label="Pause",
                    method="animate",
                    args=[
                        [None],
                        {
                            "frame": {"duration": 0, "redraw": False},
                            "mode": "immediate",
                        },
                    ],
                ),
            ],
        )
    ]
)

# One frame per cut grade, each carrying only that cut's column of mean prices.
# NOTE(review): every frame's z has a single column while the base heatmap has
# one column per cut -- confirm the animation renders as intended.
frames = []
for cut in grouped_df['cut'].unique():
    frame_data = (
        grouped_df[grouped_df['cut'] == cut]
        .pivot(index='color', columns='cut', values='price')
        .values.tolist()
    )
    frames.append(dict(data=[dict(z=frame_data, type='heatmap')], name=cut))

fig.frames = frames
fig.show()